1 Setup

Set the working dir

setwd("/mnt/picea/projects/spruce/u2015029/20170824")

Load libraries

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))

Source some helper functions

source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")

Create palettes

pal <- brewer.pal(8,"Dark2")
pal12 <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)

Register the default plot margin

mar <- par("mar")

2 Raw data

2.1 Loading

Read the sample information

samples <- read.csv2("~/Git/UPSCb/projects/spruce-cone-development/doc/All_samples.csv")

2.1.1 Original data

orig <- list.files("kallisto", 
                    recursive = TRUE, 
                    pattern = "abundance.tsv",
                    full.names = TRUE)

name them

names(orig) <- sub("_sortmerna.*","",
                    sapply(strsplit(orig, "/"), .subset, 2))

Reorder the sample data.frame according to the way the results were read in and accounting for tech reps

samples <- samples[match(names(orig),samples$Complete_SciLifeID),]

Read the expression at the transcript level

tx <- suppressMessages(tximport(files = orig, 
                                type = "kallisto", 
                                txOut = TRUE))
kg <- round(tx$counts)

2.2 Raw Data QC analysis

2.2.1 Original data

Check how many genes are never expressed

sel <- rowSums(kg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(kg),digits=1),
        sum(sel),
        nrow(kg))
## [1] "14.3% percent (9543) of 66632 genes are not expressed"

The cumulative gene coverage is as expected

plot(density(log10(rowMeans(kg))),col=pal[1],
     main="gene mean raw counts distribution",
     xlab="mean raw counts (log10)")

The same is done for the individual samples colored by sex.

plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
                  col=c(1,pal)[as.integer(samples$Sex)],
                  legend.x="topright",
                  legend=levels(samples$Sex),
                  legend.col=c(1,pal)[1:nlevels(samples$Sex)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

and by batch.

plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
                  col=pal[as.integer(samples$Batch)],
                  legend.x="topright",
                  legend=levels(samples$Batch),
                  legend.col=pal[1:nlevels(samples$Batch)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

2.3 Raw data export

dir.create(file.path("analysis","kallisto"),showWarnings=FALSE)
write.csv(kg,file="analysis/kallisto/raw-unormalised-gene-expression_data.csv")

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Original data Create the dds object, without giving any prior on the design

dds.kg <- DESeqDataSetFromMatrix(
  countData = kg,
  colData = samples[,c("Complete_SciLifeID","ID","Biol_repl_ID","Sex","Type","Sampling","Tree")],
  design = ~Complete_SciLifeID)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds.kg <- estimateSizeFactors(dds.kg)
sizes.kg <- sizeFactors(dds.kg)
names(sizes.kg) <- colnames(kg)
pander(sizes.kg)
Table continues below
1_150115_BC6L4PANXX_P1387_101 1_150115_BC6L4PANXX_P1387_102
0.9992 1.061
Table continues below
1_150115_BC6L4PANXX_P1387_103 1_150115_BC6L4PANXX_P1387_104
0.9865 0.8314
Table continues below
1_150115_BC6L4PANXX_P1387_105 1_150115_BC6L4PANXX_P1387_106
0.9708 1.109
Table continues below
1_150115_BC6L4PANXX_P1387_107 1_150115_BC6L4PANXX_P1387_108
1.243 0.9822
Table continues below
1_150115_BC6L4PANXX_P1387_109 1_150115_BC6L4PANXX_P1387_110
0.9398 0.9635
Table continues below
1_150115_BC6L4PANXX_P1387_111 1_150115_BC6L4PANXX_P1387_112
0.7944 1.002
Table continues below
1_150115_BC6L4PANXX_P1387_113 1_150115_BC6L4PANXX_P1387_114
0.8913 1.064
Table continues below
1_150115_BC6L4PANXX_P1387_115 1_150115_BC6L4PANXX_P1387_116
1.193 1.088
Table continues below
1_150115_BC6L4PANXX_P1387_117 1_150115_BC6L4PANXX_P1387_118
0.9988 1.007
Table continues below
1_150115_BC6L4PANXX_P1387_119 1_150115_BC6L4PANXX_P1387_120
0.7673 0.87
Table continues below
1_150115_BC6L4PANXX_P1387_121 1_150115_BC6L4PANXX_P1387_122
1.004 1.028
Table continues below
1_150115_BC6L4PANXX_P1387_123 1_150115_BC6L4PANXX_P1387_124
1.316 0.8749
Table continues below
1_150115_BC6L4PANXX_P1387_125 2_150115_BC6L4PANXX_P1387_101
0.9088 1.002
Table continues below
2_150115_BC6L4PANXX_P1387_102 2_150115_BC6L4PANXX_P1387_103
1.069 1.004
Table continues below
2_150115_BC6L4PANXX_P1387_104 2_150115_BC6L4PANXX_P1387_105
0.8406 0.9906
Table continues below
2_150115_BC6L4PANXX_P1387_106 2_150115_BC6L4PANXX_P1387_107
1.115 1.241
Table continues below
2_150115_BC6L4PANXX_P1387_108 2_150115_BC6L4PANXX_P1387_109
0.9853 0.9448
Table continues below
2_150115_BC6L4PANXX_P1387_110 2_150115_BC6L4PANXX_P1387_111
0.9616 0.8109
Table continues below
2_150115_BC6L4PANXX_P1387_112 2_150115_BC6L4PANXX_P1387_113
1.017 0.91
Table continues below
2_150115_BC6L4PANXX_P1387_114 2_150115_BC6L4PANXX_P1387_115
1.071 1.195
Table continues below
2_150115_BC6L4PANXX_P1387_116 2_150115_BC6L4PANXX_P1387_117
1.09 1.011
Table continues below
2_150115_BC6L4PANXX_P1387_118 2_150115_BC6L4PANXX_P1387_119
1.008 0.7804
Table continues below
2_150115_BC6L4PANXX_P1387_120 2_150115_BC6L4PANXX_P1387_121
0.8813 1.029
Table continues below
2_150115_BC6L4PANXX_P1387_122 2_150115_BC6L4PANXX_P1387_123
1.035 1.322
Table continues below
2_150115_BC6L4PANXX_P1387_124 2_150115_BC6L4PANXX_P1387_125 A10_S10_L001
0.8803 0.9183 0.6457
Table continues below
A10_S2_L001 A11_S11_L001 A11_S3_L001 A12_S12_L001 A13_S13_L001
1.035 0.6138 0.9185 1.736 2.439
Table continues below
A14_S14_L002 A14_S4_L001 A15_S15_L002 A16_S16_L002 A16_S5_L001
0.8509 1.172 2.676 0.75 0.9613
Table continues below
A17_S1_L001 A18_S18_L002 A19_S19_L002 A1_S1_L001 A20_S20_L002
0.1163 2.023 2.948 1.612 1.863
Table continues below
A21_S21_L002 A22_S22_L002 A23_S23_L002 A24_S24_L002 A24_S6_L001
2.944 0.9153 1.584 0.8449 1.31
Table continues below
A25_S25_L002 A26_S26_L003 A28_S27_L003 A29_S28_L003 A2_S2_L001
1.575 1.724 3.379 1.664 0.4734
Table continues below
A30_S29_L003 A34_S30_L003 A35_S31_L003 A36_S32_L003 A37_S33_L003
1.08 1.828 1.191 1.9 2.101
Table continues below
A38_S34_L003 A3_S3_L001 A40_S4_L001 A41_S5_L001 A43_S6_L001
1.097 2.109 1.47 0.803 0.979
Table continues below
A43_S8_L001 A4_S4_L001 A5_S5_L001 A6_S6_L001 A7_S7_L001 A8_S8_L001
1.313 1.186 2.279 1.563 2.798 2.145
Table continues below
A9_S1_L001 A9_S9_L001 B22_S3_L001 B23_S35_L003 B24_S36_L003
1.192 0.7917 0.8102 1.146 1.033
Table continues below
B24_S7_L001 B26_S37_L003 B27_S49_L004 B28_S48_L004 B30_S46_L004
2.635 0.774 0.6297 0.9617 0.399
Table continues below
B32_S45_L004 B36_S39_L004 B37_S40_L004 B38_S41_L004 B39_S42_L004
0.8002 2.326 1.078 1.027 2.869
Table continues below
B41_S43_L004 B42_S44_L004 B51_S47_L004 B5_S38_L004 B7_S17_L002
0.6541 0.7117 1.299 0.8812 0.9664
Table continues below
B8_S2_L001 totRNA-10_S33_L004 totRNA-14_S31_L004 totRNA-14_S8_L001
0.6251 0.71 0.5486 0.9464
Table continues below
totRNA-15_S29_L004 totRNA-16_S30_L004 totRNA-17_S28_L004
0.9678 0.6796 0.8932
Table continues below
totRNA-18_S32_L004 totRNA-18_S9_L001 totRNA-2_S35_L004
0.4464 0.7857 0.7076
Table continues below
totRNA-33_S39_L004 totRNA-34_S36_L004 totRNA-35_S37_L004
0.4607 0.808 1.667
Table continues below
totRNA-40_S38_L004 totRNA-47_S27_L004 totRNA-47_S7_L001
0.452 0.5002 0.8702
Table continues below
totRNA-48_S10_L001 totRNA-48_S34_L004 totRNA-48_S9_L001
0.9603 0.6847 0.1121
Table continues below
totRNA-49_S11_L001 totRNA-49_S40_L004 totRNA-50_S12_L001
1.099 0.6193 0.9332
totRNA-50_S41_L004
0.441
boxplot(sizes.kg, main="Sequencing libraries size factor")

3.2 Variance Stabilising Transformation

At the gene level

vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE)
vst.kg <- assay(vsd.kg)
vst.kg <- vst.kg - min(vst.kg)

Validate the VST

meanSdPlot(vst.kg[rowSums(kg)>0,])

Export the vst

write.csv(vst.kg,"analysis/kallisto/library-size-normalized_variance-stabilized_gene-expression_data.csv")

4 QC on the normalised data

4.1 PCA

".pca" <- function(vst,fact,lgd="topleft",pal=brewer.pal(8,"Dark2")){
  pc <- prcomp(t(vst))
  
  percent <- round(summary(pc)$importance[2,]*100)
  
  #' ### 3 first dimensions
  mar=c(5.1,4.1,4.1,2.1)
  scatterplot3d(pc$x[,1],
                pc$x[,2],
                pc$x[,3],
                xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
                ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
                zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
                color=pal[as.integer(fact)],
                pch=19)
  legend(lgd,pch=19,
         col=pal[1:nlevels(fact)],
         legend=levels(fact))
  par(mar=mar)
}

4.1.1 Batch

.pca(vst.kg,factor(samples$Batch),lgd="topright")

4.1.2 Sex

.pca(vst.kg,factor(samples$Sex),lgd="topright",pal=c(1,pal))

4.1.3 1st and 2nd dims

pc <- prcomp(t(vst.kg))
percent <- round(summary(pc)$importance[2,]*100)

Batch

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(samples$Batch))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal,levels(factor(samples$Batch)),pch=19)

Sex

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=c(1,pal)[as.integer(factor(samples$Sex))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=c(1,pal),levels(samples$Sex),pch=19)

Sampling

samples$Date <- 
  factor(sapply(lapply(strsplit(sub("Oct","10",
               sub("Sep","09",
                   sub("Aug","08",
                       sub("Jun","06",
                           sub("May","05",
                               sub("Apr","04",samples$Sampling)))))),"-"),rev),
         paste,collapse="-"))

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal12[as.integer(factor(samples$Date))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal12,levels(samples$Date),pch=19)

4.1.4 2nd and 3rd dims

Batch

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(samples$Batch))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal,levels(factor(samples$Batch)),pch=19)

Sex

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=c(1,pal)[as.integer(samples$Sex)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=c(1,pal),levels(samples$Sex),pch=19)

Sampling

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal12[as.integer(factor(samples$Date))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal12,levels(samples$Date),pch=19)

4.2 Heatmap

sel <- featureSelect(vst.kg,samples$Biol_repl_ID,exp = 5,nrep = 2)
heatmap.2(vst.kg[sel,],trace = "none",labRow = FALSE, 
          col=hpal, ColSideColors = c(1,pal)[as.integer(samples$Sex)],
          scale="row")

4.2.1 Saturate the expression

vst.sat <- vst.kg[sel,]
vst.sat[vst.sat<3] <- 3
vst.sat[vst.sat>8] <- 8

heatmap.2(vst.sat,trace = "none",labRow = FALSE, 
          col=hpal, ColSideColors = c(1,pal)[as.integer(samples$Sex)])

4.2.2 Sample Hierarchical clustering

hc <- hclust(dist(t(vst.sat)))
plot(hc, labels=samples$ID,
     main = "Hierarchical clustering",cex=0.7)

5 Merge the technical replicates

counts <- do.call(
  cbind,
  lapply(split.data.frame(t(kg),
                          samples$ID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]
write.csv(csamples,file="~/Git/UPSCb/projects/spruce-cone-development/doc/biological-samples.csv")

5.1 QC

The cumulative gene coverage is as expected

plot(density(log10(rowMeans(counts))),col=pal[1],
     main="gene mean raw counts distribution",
     xlab="mean raw counts (log10)")

The same is done for the individual samples colored by sex.

plot.multidensity(lapply(1:ncol(counts),function(k){log10(counts)[,k]}),
                  col=c(1,pal)[as.integer(csamples$Sex)],
                  legend.x="topright",
                  legend=levels(csamples$Sex),
                  legend.col=c(1,pal)[1:nlevels(csamples$Sex)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

and by batch.

plot.multidensity(lapply(1:ncol(counts),function(k){log10(counts)[,k]}),
                  col=pal[as.integer(csamples$Batch)],
                  legend.x="topright",
                  legend=levels(csamples$Batch),
                  legend.col=pal[1:nlevels(csamples$Batch)],
                  legend.lwd=2,
                  main="sample raw counts distribution",
                  xlab="per gene raw counts (log10)")

Write the count table (tech. rep. combined)

write.csv(counts,file="analysis/kallisto/raw-unormalised-gene-expression-tech-rep-combined_data.csv")

6 Data normalisation

6.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Original data Create the dds object, without giving any prior on the design

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = csamples[,c("ID","Biol_repl_ID","Sex","Type","Sampling","Date","Batch","Tree")],
  design = ~Biol_repl_ID)
## converting counts to integer mode

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
names(sizes) <- colnames(counts)
pander(sizes)
Table continues below
F1-04-10 F1-08-18 F1-09-12 F1-10-08 F1a-08-01 F1a-08-19 F1a-09-16
1.21 0.6307 1.275 1.686 1.138 1.21 0.646
Table continues below
F1b-08-01 F1b-08-19 F1b-09-16 F1c-08-01 F1c-08-19 F1c-09-16
0.3358 0.08201 1.12 1.492 1.429 1.525
Table continues below
F1d-08-01 F2-09-12 F2-10-08 F3-08-18 F3-09-12 F3-10-08 F4-09-12
1.488 1.533 1.138 1.057 1.459 1.426 1.509
Table continues below
F4-10-08 F5-09-12 F5-10-08 F7-04-29 F8-04-10 F8-04-29 F9-04-10
1.425 1.417 1.435 1.238 0.283 2.603 0.446
Table continues below
F9-04-29 FA1a-08-01 FA1a-10-18 FA1b-08-01 FA1b-10-18 FA1c-08-01
0.5746 1.72 2.385 1.433 1.178 1.896
Table continues below
FA1c-10-18 M1-04-10 M1-08-18 M1-09-12 M1-10-08 M1a-08-01 M1b-08-01
0.7604 0.3266 0.9689 1.416 1.502 1.035 0.5671
Table continues below
M1c-08-01 M2-09-12 M2-10-08 M3-08-18 M3-09-12 M3-10-08 M4-09-12
1.617 1.405 1.756 0.4807 1.184 1.334 1.389
Table continues below
M4-10-08 M5-09-12 M5-10-08 M7-04-29 M8-04-10 M8-04-29 M9-04-10
1.387 1.572 1.361 0.5096 0.5717 0.8169 0.6855
Table continues below
M9-04-29 O7-06-11 O8-06-11 O9-06-11 P7-05-12 P8-05-12 P9-05-12
0.5533 0.5729 1.653 0.7267 0.3276 0.4797 0.5165
Table continues below
S7-06-11 S8-06-11 S9-06-11 TA1a-10-18 TA1b-10-18 TA1c-10-18
1.181 0.7675 2.03 1.189 1.077 1.219
Table continues below
TL1a -08-01 TL1b -08-01 TL1c -08-01 V1-04-10 V1-08-18 V1-10-08
1.982 1.516 1.405 0.9644 0.6844 1.282
Table continues below
V2-10-08 V3-08-18 V3-10-08 V4-10-08 V5-10-08 V7-04-29 V8-04-10
1.239 0.8735 1.865 1.092 1.236 0.5012 0.9124
Table continues below
V8-04-29 V9-04-10 V9-04-29 VL1a-08-01 VL1a-08-19 VL1a-09-16
0.4447 0.6838 0.6241 0.842 2.084 1.114
Table continues below
VL1a-10-25 VL1b-08-01 VL1b-08-19 VL1b-09-16 VL1b-10-25 VL1c-08-01
1.289 1.614 1.327 1.221 0.8407 1.104
VL1c-08-19 VL1c-10-25 VL1d-08-01
2.084 1.341 0.7748
boxplot(sizes, main="Sequencing libraries size factor")

pander(sort(sizes))
Table continues below
F1b-08-19 F8-04-10 M1-04-10 P7-05-12 F1b-08-01 V8-04-29 F9-04-10
0.08201 0.283 0.3266 0.3276 0.3358 0.4447 0.446
Table continues below
P8-05-12 M3-08-18 V7-04-29 M7-04-29 P9-05-12 M9-04-29 M1b-08-01
0.4797 0.4807 0.5012 0.5096 0.5165 0.5533 0.5671
Table continues below
M8-04-10 O7-06-11 F9-04-29 V9-04-29 F1-08-18 F1a-09-16 V9-04-10
0.5717 0.5729 0.5746 0.6241 0.6307 0.646 0.6838
Table continues below
V1-08-18 M9-04-10 O9-06-11 FA1c-10-18 S8-06-11 VL1d-08-01 M8-04-29
0.6844 0.6855 0.7267 0.7604 0.7675 0.7748 0.8169
Table continues below
VL1b-10-25 VL1a-08-01 V3-08-18 V8-04-10 V1-04-10 M1-08-18
0.8407 0.842 0.8735 0.9124 0.9644 0.9689
Table continues below
M1a-08-01 F3-08-18 TA1b-10-18 V4-10-08 VL1c-08-01 VL1a-09-16
1.035 1.057 1.077 1.092 1.104 1.114
Table continues below
F1b-09-16 F2-10-08 F1a-08-01 FA1b-10-18 S7-06-11 M3-09-12
1.12 1.138 1.138 1.178 1.181 1.184
Table continues below
TA1a-10-18 F1a-08-19 F1-04-10 TA1c-10-18 VL1b-09-16 V5-10-08
1.189 1.21 1.21 1.219 1.221 1.236
Table continues below
F7-04-29 V2-10-08 F1-09-12 V1-10-08 VL1a-10-25 VL1b-08-19 M3-10-08
1.238 1.239 1.275 1.282 1.289 1.327 1.334
Table continues below
VL1c-10-25 M5-10-08 M4-10-08 M4-09-12 M2-09-12 TL1c -08-01
1.341 1.361 1.387 1.389 1.405 1.405
Table continues below
M1-09-12 F5-09-12 F4-10-08 F3-10-08 F1c-08-19 FA1b-08-01 F5-10-08
1.416 1.417 1.425 1.426 1.429 1.433 1.435
Table continues below
F3-09-12 F1d-08-01 F1c-08-01 M1-10-08 F4-09-12 TL1b -08-01
1.459 1.488 1.492 1.502 1.509 1.516
Table continues below
F1c-09-16 F2-09-12 M5-09-12 VL1b-08-01 M1c-08-01 O8-06-11 F1-10-08
1.525 1.533 1.572 1.614 1.617 1.653 1.686
Table continues below
FA1a-08-01 M2-10-08 V3-10-08 FA1c-08-01 TL1a -08-01 S9-06-11
1.72 1.756 1.865 1.896 1.982 2.03
VL1c-08-19 VL1a-08-19 FA1a-10-18 F8-04-29
2.084 2.084 2.385 2.603

6.2 Variance Stabilising Transformation

At the gene level

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

Validate the VST

meanSdPlot(vst[rowSums(vst)>0,])

Export the vst

write.csv(vst,"analysis/kallisto/library-size-normalized_variance-stabilized_gene-expression-tech-rep-combined_data.csv")

7 QC on the normalised data

7.1 PCA

".pca" <- function(vst,fact,lgd="topleft",pal=brewer.pal(8,"Dark2")){
  pc <- prcomp(t(vst))
  
  percent <- round(summary(pc)$importance[2,]*100)
  
  #' ### 3 first dimensions
  mar=c(5.1,4.1,4.1,2.1)
  scatterplot3d(pc$x[,1],
                pc$x[,2],
                pc$x[,3],
                xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
                ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
                zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
                color=pal[as.integer(fact)],
                pch=19)
  legend(lgd,pch=19,
         col=pal[1:nlevels(fact)],
         legend=levels(fact))
  par(mar=mar)
}

7.1.1 Batch

.pca(vst,factor(csamples$Batch),lgd="topright")

7.1.2 Sex

.pca(vst,factor(csamples$Sex),lgd="topright",pal=c(1,pal))

7.1.3 1st and 2nd dims

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

Batch

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal[as.integer(factor(csamples$Batch))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal,levels(factor(csamples$Batch)),pch=19)

Sex

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=c(1,pal)[as.integer(factor(csamples$Sex))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=c(1,pal),levels(csamples$Sex),pch=19)

Sampling

plot(pc$x[,1],
     pc$x[,2],
     xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
     ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     col=pal12[as.integer(factor(csamples$Date))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal12,levels(csamples$Date),pch=19)

7.1.4 2nd and 3rd dims

Batch

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal[as.integer(factor(csamples$Batch))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal,levels(factor(csamples$Batch)),pch=19)

Sex

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=c(1,pal)[as.integer(csamples$Sex)],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=c(1,pal),levels(csamples$Sex),pch=19)

Sampling

plot(pc$x[,2],
     pc$x[,3],
     xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
     ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
     col=pal12[as.integer(factor(csamples$Date))],
     pch=19,
     main="Principal Component Analysis",sub="variance stabilized counts")

legend("topright",bty="n",col=pal12,levels(csamples$Date),pch=19)

7.2 Heatmap

sel <- featureSelect(vst,csamples$Biol_repl_ID,exp = 5,nrep = 2)
heatmap.2(vst[sel,],trace = "none",labRow = FALSE, 
          col=hpal, ColSideColors = c(1,pal)[as.integer(csamples$Sex)],
          scale="row")

7.2.1 Saturate the expression

vst.sat <- vst[sel,]
vst.sat[vst.sat<3] <- 3
vst.sat[vst.sat>8] <- 8

heatmap.2(vst.sat,trace = "none",labRow = FALSE, 
          col=hpal, ColSideColors = c(1,pal)[as.integer(csamples$Sex)])

7.2.2 Sample Hierarchical clustering

hc <- hclust(dist(t(vst.sat)))
plot(hc, labels=csamples$ID,
     main = "Hierarchical clustering",cex=0.7)

plot(hc, labels=round(sizes[csamples$ID],digits = 2),
     main = "Hierarchical clustering",cex=0.7)

8 Conclusion

The data quality looks overall good. There is some covariates that will need to be taken into account in the analysis, principaly the sampling date. This is confounded with the batch effect. Some direct comparisons will be counfonded with time and it might be difficult to disentangle that effect. We could try to block it on the whole dataset, but we will have an unbalanced design.

The technical replicates…

9 Session Info

## R version 3.4.0 (2017-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] hexbin_1.27.1              vsn_3.44.0                
##  [3] tximport_1.4.0             scatterplot3d_0.3-40      
##  [5] RColorBrewer_1.1-2         pander_0.6.0              
##  [7] gplots_3.0.1               DESeq2_1.16.1             
##  [9] SummarizedExperiment_1.6.3 DelayedArray_0.2.7        
## [11] matrixStats_0.52.2         Biobase_2.36.2            
## [13] GenomicRanges_1.28.3       GenomeInfoDb_1.12.2       
## [15] IRanges_2.10.2             S4Vectors_0.14.3          
## [17] BiocGenerics_0.22.0       
## 
## loaded via a namespace (and not attached):
##  [1] bit64_0.9-7             splines_3.4.0          
##  [3] gtools_3.5.0            Formula_1.2-2          
##  [5] affy_1.54.0             latticeExtra_0.6-28    
##  [7] blob_1.1.0              GenomeInfoDbData_0.99.0
##  [9] yaml_2.1.14             RSQLite_2.0            
## [11] backports_1.1.0         lattice_0.20-35        
## [13] limma_3.32.2            digest_0.6.12          
## [15] XVector_0.16.0          checkmate_1.8.3        
## [17] colorspace_1.3-2        preprocessCore_1.38.1  
## [19] htmltools_0.3.6         Matrix_1.2-10          
## [21] plyr_1.8.4              XML_3.98-1.9           
## [23] genefilter_1.58.1       zlibbioc_1.22.0        
## [25] xtable_1.8-2            scales_0.4.1           
## [27] gdata_2.18.0            affyio_1.46.0          
## [29] BiocParallel_1.10.1     htmlTable_1.9          
## [31] tibble_1.3.3            annotate_1.54.0        
## [33] ggplot2_2.2.1           nnet_7.3-12            
## [35] lazyeval_0.2.0          survival_2.41-3        
## [37] magrittr_1.5            memoise_1.1.0          
## [39] evaluate_0.10.1         foreign_0.8-69         
## [41] BiocInstaller_1.26.0    tools_3.4.0            
## [43] data.table_1.10.4       hms_0.3                
## [45] stringr_1.2.0           munsell_0.4.3          
## [47] locfit_1.5-9.1          cluster_2.0.6          
## [49] AnnotationDbi_1.38.1    compiler_3.4.0         
## [51] caTools_1.17.1          rlang_0.1.1            
## [53] rhdf5_2.20.0            grid_3.4.0             
## [55] RCurl_1.95-4.8          htmlwidgets_0.9        
## [57] labeling_0.3            bitops_1.0-6           
## [59] base64enc_0.1-3         rmarkdown_1.6          
## [61] gtable_0.2.0            DBI_0.7                
## [63] R6_2.2.2                gridExtra_2.2.1        
## [65] knitr_1.16              bit_1.1-12             
## [67] Hmisc_4.0-3             rprojroot_1.2          
## [69] readr_1.1.1             KernSmooth_2.23-15     
## [71] stringi_1.1.5           Rcpp_0.12.11           
## [73] geneplotter_1.54.0      rpart_4.1-11           
## [75] acepack_1.4.1